import asyncio
import json
from functools import partial
from typing import Dict, List, Literal

import aiohttp
from bs4 import BeautifulSoup
from codetiming import Timer
from tqdm.asyncio import tqdm_asyncio

# Browser-like User-Agent passed as the request headers by the fetch
# coroutines in this file.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.76"
    ),
}


async def getDocTableById(data_docId: str) -> Dict[str, str]:
    """Fetch one document and flatten its HTML table into a dict.

    Downloads the static JSON for ``data_docId`` and parses the HTML stored
    under ``data["data"]["docClob"]``. For every ``<tr>`` the stripped text of
    the first ``<td>`` becomes the key, and the text of the ``<p>`` elements
    inside the second ``<td>`` becomes the value (several paragraphs are
    joined with ``";"``; no paragraphs gives ``""``).

    Args:
        data_docId: Document id, interpolated verbatim into the URL.

    Returns:
        Mapping of row label to row text. Empty when the server does not
        answer with HTTP 200.
    """
    # Write the ``data_docId=`` prefix out by hand: the f-string ``=``
    # specifier formats the value with repr(), which would wrap a str id in
    # quotes and produce an invalid URL.
    url = (
        "https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectByDocId/"
        f"data_docId={data_docId}.json"
    )
    table: Dict[str, str] = {}
    async with aiohttp.request("GET", url, headers=headers) as response:
        if response.status != 200:
            return table
        data = await response.json()
    soup = BeautifulSoup(data["data"]["docClob"], "html.parser")
    for tr in soup.select("tr"):
        key_cell = tr.select_one("td:nth-child(1)")
        if key_cell is None:
            # No first <td> in this row, so there is nothing to use as a key.
            continue
        value_cell = tr.select_one("td:nth-child(2)")
        # select() returns a (possibly empty) list, never None, so only the
        # cell itself needs a None check; an empty list joins to "".
        paragraphs = (
            [] if value_cell is None else [p.text for p in value_cell.select("p")]
        )
        table[key_cell.text.strip()] = ";".join(paragraphs)
    return table


# Endpoint names that may be substituted into the ``/cbircweb/DocInfo/`` URL
# through the ``apiName`` parameter.
APIName = Literal[
    "SelectDocItemByItemPId",
    "SelectDocByItemIdAndChild",
]


async def getDocIds(
    pageIndex: int,
    pageSize: int = 18,
    itemId: int = 4114,
    apiName: APIName = "SelectDocByItemIdAndChild",
) -> List[str]:
    """Fetch one page of the document listing and return its document ids.

    Args:
        pageIndex: Page number sent as the ``pageIndex`` query parameter.
        pageSize: Number of entries requested per page.
        itemId: Value sent as the ``itemId`` query parameter.
        apiName: Endpoint name appended to ``/cbircweb/DocInfo/``.

    Returns:
        The ``docId`` of every row under ``data["data"]["rows"]``. Empty when
        the server does not answer with HTTP 200 or the payload has no rows.
    """
    # Explicit ``name={value}`` pairs instead of the ``{value=}`` specifier,
    # which formats with repr() and is only correct for plain ints.
    url = (
        f"https://www.cbirc.gov.cn/cbircweb/DocInfo/{apiName}"
        f"?itemId={itemId}&pageSize={pageSize}&pageIndex={pageIndex}"
    )
    async with aiohttp.request("GET", url, headers=headers) as response:
        if response.status != 200:
            # Same best-effort policy as getDocTableById: a failed page
            # contributes nothing instead of aborting the whole run.
            return []
        data = await response.json()
    # Tolerate a null/missing "data" or "rows" entry in the payload.
    rows = (data.get("data") or {}).get("rows") or []
    return [row["docId"] for row in rows]


async def getDocTables(
    nPages: int = 8,
    pageSize: int = 18,
    itemId: int = 4114,
    apiName: APIName = "SelectDocByItemIdAndChild",
) -> List[Dict[str, str]]:
    """
    Fetch the table contents of every document listed on pages 1..nPages.

    The listing pages are requested concurrently first; then every document
    id found on them is fetched concurrently. Progress is printed to stdout.

    Parameters
    ----------
        - nPages: number of listing pages to read (int, default: 8)
        - pageSize: entries per listing page (int, default: 18)
        - itemId: item id passed to the listing API (int, default: 4114)
        - apiName: listing API name (APIName, default: "SelectDocByItemIdAndChild")

    Returns
    -------
        - tables: one table dict per document (List[Dict[str, str]])

    """
    pageIndices = range(1, nPages + 1)
    print("正在获取文号列表...")
    idPages = await tqdm_asyncio.gather(
        *map(
            partial(getDocIds, pageSize=pageSize, itemId=itemId, apiName=apiName),
            pageIndices,
        )
    )
    # Flatten the per-page lists in one pass; sum(pages, []) re-copies the
    # accumulated list for every page.
    docIds = [docId for page in idPages for docId in page]
    print("获取文号列表完成.")
    print("正在获取文档表格内容...")
    tables = await tqdm_asyncio.gather(*map(getDocTableById, docIds))
    print("获取文档表格内容完成.")
    return tables


if __name__ == "__main__":
    # Script entry point: scrape the documents of item 1855 and dump the
    # resulting tables as JSON. The import is local because only this entry
    # point needs it.
    from pathlib import Path

    outputPath = Path("data/京金罚决字.json")
    # Create the output directory before scraping, so a missing ``data/``
    # folder cannot raise FileNotFoundError after all the network work is done.
    outputPath.parent.mkdir(parents=True, exist_ok=True)

    with Timer(text="总共耗时: {:.2f}s"):
        tables = asyncio.run(
            getDocTables(apiName="SelectDocItemByItemPId", itemId=1855)
        )
    with outputPath.open("w", encoding="utf-8") as f:
        # ensure_ascii=False keeps the Chinese text readable in the file.
        json.dump(tables, f, ensure_ascii=False)
